Lab11
Confidence Intervals for Z-test
Example 1: C.I for Z-test
# Example 1: z-based CI for the mean of N(0, 1) data (sigma = 1 known).
n <- 30 # sample size
alpha <- .90 # Desired coverage probability (note: this is the CONFIDENCE level, not the significance level)
x.1 <- rnorm(n) # Make a single sample
#z.1 <- qnorm((1 + alpha)/2, lower.tail = TRUE)
z.1 <- qnorm((1 - alpha)/2, lower.tail = FALSE) # find z-star; use TRUE/FALSE, never T/F (reassignable)
z.1## [1] 1.644854
qnorm((1 + alpha)/2) # alternative formula ## [1] 1.644854
# Make a 90% CI for the mean: xbar +/- z* * sigma/sqrt(n), with sigma = 1
c(mean(x.1) - z.1*1/sqrt(n), mean(x.1) + z.1*1/sqrt(n))## [1] -0.54371561 0.05690001
t-Confidence Intervals
Example 2:
# Make a t-confidence interval "by hand"
# (two-sided 90% CI: each tail gets 5%, hence the .95 quantile below)
xbar = 185 # given in problem
SD = 45 # given in problem
n = 40 # sample size given in problem
SE = SD/sqrt(n)
# t critical value with df = n - 1 is slightly larger than the z value:
qt(.95,n-1)## [1] 1.684875
qnorm(.95)## [1] 1.644854
# CI endpoints: xbar +/- t* * SE
L = xbar - qt(.95,39)*SE
U = xbar + qt(.95,39)*SE
# Here is the CI:
L## [1] 173.0119
U## [1] 196.9881
Example 3:
Make a single CI, using the t.test() function in R
# Sample size n = 10
n <- 10 # sample size
# Make a single CI, using the t.test() function in R
x.2 <- rnorm(n)
mytest.95 <- t.test(x.2) # default confidence level = 0.95 (left assignment instead of ->)
mytest.95##
## One Sample t-test
##
## data: x.2
## t = -0.69429, df = 9, p-value = 0.505
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -1.0471856 0.5553425
## sample estimates:
## mean of x
## -0.2459216
# The htest object is a named list; these are its components:
names(mytest.95)## [1] "statistic" "parameter" "p.value" "conf.int" "estimate"
## [6] "null.value" "stderr" "alternative" "method" "data.name"
mytest.95$method## [1] "One Sample t-test"
mytest.95$statistic## t
## -0.6942945
# Extract the CI:
mytest.95$conf.int # this is the CI## [1] -1.0471856 0.5553425
## attr(,"conf.level")
## [1] 0.95
# Redo with confidence level 90%:
mytest.9 <- t.test(x.2, conf.level = .9)
mytest.9##
## One Sample t-test
##
## data: x.2
## t = -0.69429, df = 9, p-value = 0.505
## alternative hypothesis: true mean is not equal to 0
## 90 percent confidence interval:
## -0.8952166 0.4033735
## sample estimates:
## mean of x
## -0.2459216
Example 4:
Now simulate 10000 t-based CIs, confidence level = 95%
N = 10000 # Number of samples for the simulation
gamma = .95 # Confidence level = desired coverage probability
n = 5 # Sample size
# Preallocate a data frame for the lower (L) and upper (U) CI endpoints.
mydf.3 = data.frame(L = rep(NA,N), U = rep(NA,N))
for (j in 1:N){
x.3 <- rnorm(n)
mytest <- t.test(x.3, conf.level = gamma)
mydf.3$L[j] = mytest$conf.int[1]
mydf.3$U[j] = mytest$conf.int[2]
}
# Find fraction of cases where the CI is too far to the right:
mean(mydf.3$L > 0) # probability that the CI is too far to the right## [1] 0.028
# Find fraction of cases where the CI is too far to the left:
mean(mydf.3$U < 0) # probability that the CI is too far to the left## [1] 0.0226
# Probabilities add to about 5%. You can see the coverage is pretty good. It is nearly 0.025 from both sides. Note that we are sampling from a normal distribution.
Example 5:
Repeat this with samples of size 5 from an exponential distribution.
# Example 5: same simulation, but sampling from the skewed Exp(1) distribution
# (true mean = 1, so a correct CI should cover 1). N is reused from Example 4.
gamma = .95 # Coverage probability
n = 5 # Sample size
mydf.4 = data.frame(L = rep(NA,N), U = rep(NA,N))
for (j in 1:N){
x.4 <- rexp(n)
mytest.4 <- t.test(x.4, conf.level = gamma)
mydf.4$L[j] = mytest.4$conf.int[1]
mydf.4$U[j] = mytest.4$conf.int[2]
}
# Miss on the right (whole CI above the true mean 1):
mean(mydf.4$L > 1) #lambda is 1## [1] 0.0046
# Miss on the left — about 11%, far more than the nominal 2.5%:
mean(mydf.4$U < 1) #too far to the left = 11%## [1] 0.1173
The coverage probability is much less than 95%. Too many confidence intervals are too far to the left to contain the correct value.
Segment plot to examine this:
# Segment plot: draw the first 100 simulated CIs as horizontal segments,
# with a vertical line at the true mean (1 for Exp(1)).
plot(x = c(-2, 4), y = c(1, 100), type = "n", xlab = "",
     ylab = "",
     main = "100 CI's, n=5, exponential, gamma = .95")
for (j in 1:100){
  segments(mydf.4$L[j], j, mydf.4$U[j], j)
}
abline(v = 1, lwd = 2, col = 2)
Most of the C.I’s are large and a little bit off.
Example 6:
Repeat with larger sample size.
# Example 6: repeat the Exp(1) coverage simulation with a larger sample size.
# N, gamma, and mydf.4 are reused from Example 5.
n = 50 #larger sample size
for (j in 1:N){
x.4 <- rexp(n)
mytest.4 <- t.test(x.4, conf.level = gamma)
mydf.4$L[j] = mytest.4$conf.int[1]
mydf.4$U[j] = mytest.4$conf.int[2]
}
mean(mydf.4$L > 1)## [1] 0.01
mean(mydf.4$U < 1)## [1] 0.0548
# This is better but still too high. It should be 2.5% per side but the
# left-miss rate is still a little bit higher than that.
# Segment plot: first 100 CIs for the n = 50 exponential simulation.
plot(x = c(0, 2), y = c(1, 100), type = "n", xlab = "",
     ylab = "",
     main = "100 CI's, n = 50, exponential, gamma = .95")
for (j in 1:100){
  segments(mydf.4$L[j], j, mydf.4$U[j], j)
}
abline(v = 1, lwd = 2, col = 2)
Example 7: Now do this for a uniform distribution.
This is a symmetric distribution with only one peak. The t-CI gives good coverage already for small sample sizes.
gamma = .95 # Coverage probability
n = 10 # Sample size
# Copy mydf.4 just to reuse its shape (N rows, columns L and U).
mydf.5 <- mydf.4
for (j in 1:N){
x.5 <- runif(n)
mytest.5 <- t.test(x.5, conf.level = gamma)
mydf.5$L[j] = mytest.5$conf.int[1]
mydf.5$U[j] = mytest.5$conf.int[2]
}
# True mean of Unif(0, 1) is 0.5; each miss rate is close to the nominal 2.5%.
mean(mydf.5$L > .5) #2.7%## [1] 0.0252
mean(mydf.5$U < .5)#2.7% so all together about 5.5% = acceptable## [1] 0.027
# Good coverage probability. Let’s confirm it with a segment plot of 100 CIs.
# Segment plot: first 100 CIs for the uniform simulation (true mean = 0.5).
plot(x = c(0, 1), y = c(1, 100), type = "n", xlab = "",
     ylab = "",
     main = "100 CI's, n = 10, uniform, gamma = .95")
for (j in 1:100){
  segments(mydf.5$L[j], j, mydf.5$U[j], j)
}
abline(v = .5, lwd = 2, col = 2)
Example 8: Using actual data “Girls2004.csv”
# Import the 2004 girls' birth-weight data (one row per baby).
Girls2004 <- read.csv("Girls2004.csv", stringsAsFactors=FALSE)
head(Girls2004)## ID State MothersAge Smoker Weight Gestation
## 1 1 WY 15-19 No 3085 40
## 2 2 WY 35-39 No 3515 39
## 3 3 WY 25-29 No 3775 40
## 4 4 WY 20-24 No 3265 39
## 5 5 WY 25-29 No 2970 40
## 6 6 WY 20-24 No 2850 38
# Explore dependence of Weight on state and smoking history.
boxplot(Weight ~ State, data = Girls2004)
boxplot(Weight ~ Smoker, data = Girls2004)
boxplot(Weight ~ Smoker * State, data = Girls2004)
library(ggplot2)
# Wrapping the assignment in ( ) prints the plot as a side effect.
(MyL1 <- ggplot(Girls2004, aes(x = State, y = Weight, fill = State)) +
  geom_boxplot() +
  geom_jitter(position = position_jitter(.01), aes(color = State)) +
  ggtitle("Weights of new born Girls by State"))
p <- ggplot(Girls2004, aes(x = State, y = Weight, fill = State)) + geom_boxplot()
# Extract data from the two states
girlsWY <- Girls2004$Weight[Girls2004$State == "WY"]
girlsAK <- Girls2004$Weight[Girls2004$State == "AK"]
Make a CI for the mean weight of girl babies in Wyoming and Alaska
# 95% two-sided one-sample t CI for the mean birth weight in Wyoming.
mytest <- t.test(girlsWY, conf.level = .95, alternative = "two.sided")
mytest##
## One Sample t-test
##
## data: girlsWY
## t = 48.5, df = 39, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 3074.115 3341.685
## sample estimates:
## mean of x
## 3207.9
# Extract just the interval (carries its conf.level as an attribute):
mytest$conf.int## [1] 3074.115 3341.685
## attr(,"conf.level")
## [1] 0.95
We are 95% confident that the average weight of girl babies in Wyoming is between 3074 grams and 3341 grams.
Same thing for data from Alaska
# Same 95% two-sided CI for the Alaska sample.
mytest.AK <- t.test(girlsAK, conf.level = .95,
alternative = "two.sided")
mytest.AK##
## One Sample t-test
##
## data: girlsAK
## t = 38.421, df = 39, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 3331.23 3701.47
## sample estimates:
## mean of x
## 3516.35
We are 95% confident that the average weight of girl babies in Alaska is between 3331 grams and 3701 grams.
Make a CI for the difference of means.
# Welch two-sample CI for the difference of means (AK minus WY), 90% level.
mytest.a <- t.test(girlsAK, girlsWY,
conf.level = .9,
alternative = "two.sided")
mytest.a##
## Welch Two Sample t-test
##
## data: girlsAK and girlsWY
## t = 2.7316, df = 71.007, p-value = 0.007946
## alternative hypothesis: true difference in means is not equal to 0
## 90 percent confidence interval:
## 120.2575 496.6425
## sample estimates:
## mean of x mean of y
## 3516.35 3207.90
# Note the Welch estimate for the degrees of freedom
# which results in a non-integer value. We are 90% confident that the average weight of girl babies in Alaska is between 120 g and 496 g higher than that in Wyoming.
Alternative way to construct the test:
mytest.b <- t.test(Weight ~ State, data = Girls2004, conf.level = .9) #90% C.I
mytest.b##
## Welch Two Sample t-test
##
## data: Weight by State
## t = 2.7316, df = 71.007, p-value = 0.007946
## alternative hypothesis: true difference in means between group AK and group WY is not equal to 0
## 90 percent confidence interval:
## 120.2575 496.6425
## sample estimates:
## mean in group AK mean in group WY
## 3516.35 3207.90
# Change the confidence level:
mytest.b <- t.test(Weight ~ State, data = Girls2004, conf.level = .95) #95% C.I
mytest.b##
## Welch Two Sample t-test
##
## data: Weight by State
## t = 2.7316, df = 71.007, p-value = 0.007946
## alternative hypothesis: true difference in means between group AK and group WY is not equal to 0
## 95 percent confidence interval:
## 83.29395 533.60605
## sample estimates:
## mean in group AK mean in group WY
## 3516.35 3207.90
One Sided C.I
Import the data set Titanic.csv which contains survival data (0 = death, 1 = survival) and ages of 658 passengers of the Titanic which sank on April 15, 1912. Examine the null hypothesis that the mean ages of survivors and of victims are the same against the alternative that these mean ages are different, using a t-test.
Compute the confidence Interval for the true mean difference?
Import the data. There are 135 survivors and 523 victims in this data set. We can make a side by side boxplot and see that the age distributions of survivors and victims are very similar, but survivors have a slightly smaller median. The mean age of survivors is 26.98 and the mean age of victims is 31.52. The difference is 4.54.
# Import the Titanic data (Survived: 0 = death, 1 = survival; Age in years).
Titanic <- read.csv("Titanic.csv", stringsAsFactors=FALSE)
head(Titanic)## ID Survived Age
## 1 1 1 0.92
## 2 2 0 30.00
## 3 3 1 48.00
## 4 4 0 39.00
## 5 5 0 71.00
## 6 6 0 47.00
boxplot(Age ~ Survived, data = Titanic)#let's look at the means
# Mean age within each survival group:
agg.df <- aggregate(Age ~ Survived, data = Titanic, FUN = mean)
print(agg.df)## Survived Age
## 1 0 31.51641
## 2 1 26.97778
# Difference of means (victims minus survivors); ( ) prints the value.
(mean.diff <- agg.df[1,2] - agg.df[2,2])## [1] 4.538628
Let \(\mu_S =\) population mean age of survivors and \(\mu_v =\) population mean age of victims.
The null hypothesis is \(H_0: \mu_V - \mu_S = 0\) and the alternative is \(H_a: \mu_V - \mu_S > 0\).
# Split ages by survival status; drop = TRUE returns a plain vector rather
# than a one-column data frame. (Survived=="1" relies on numeric-to-character
# coercion — presumably Survived is stored as 0/1; TODO confirm.)
survivors <- subset(Titanic, select=Age, subset=Survived=="1", drop=TRUE)
victims <- subset(Titanic, select=Age,subset=Survived=="0", drop=TRUE)
# One-sided Welch test: H_a is that victims' mean age exceeds survivors'.
# Spell out alternative = (avoid partial argument matching via alt=).
t.test(victims, survivors, alternative="greater")##
## Welch Two Sample t-test
##
## data: victims and survivors
## t = 3.091, df = 191.92, p-value = 0.001146
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 2.111742 Inf
## sample estimates:
## mean of x mean of y
## 31.51641 26.97778
###or, equivalently, with the formula interface (use the full component name):
t.test(Age ~ Survived, data = Titanic,alternative = "greater")$conf.int## [1] 2.111742 Inf
## attr(,"conf.level")
## [1] 0.95
With \(95\%\) confidence it can be concluded that the true mean age of victims is at least 2 years more than the mean age of survivors.
Comparison of Bootstrap CI and t-based CI
Example 9: Use the Verizon data
Bootstrap C.I.
# Verizon repair-time data: Time in hours, Group is ILEC (own customers)
# or CLEC (other companies' customers).
Verizon <- read.csv("Verizon.csv", stringsAsFactors=FALSE)
head(Verizon)## Time Group
## 1 17.50 ILEC
## 2 2.40 ILEC
## 3 0.00 ILEC
## 4 0.65 ILEC
## 5 22.23 ILEC
## 6 1.20 ILEC
# Separate the two groups.
VerizonI <- Verizon$Time[Verizon$Group == "ILEC"]
VerizonC <- Verizon$Time[Verizon$Group == "CLEC"]
# First look at the ILEC data (sample size = 1664)
# Make a 95% CI using the bootstrap for verizon customers
# (resample with replacement; use TRUE, never the reassignable T)
boot.mean.VerizonI <- replicate(10000, mean(sample(VerizonI,
replace = TRUE)))
quantile(boot.mean.VerizonI, c(.025, .975))## 2.5% 97.5%
## 7.716504 9.138038
# Bootstrap: for customers from other companies
boot.mean.VerizonC <- replicate(10000, mean(sample(VerizonC,
replace = TRUE)))
quantile(boot.mean.VerizonC, c(.025, .975))## 2.5% 97.5%
## 10.12642 25.62305
Make a 95% CI for the mean repair time using t.test
t.test(VerizonI, conf.level = .95)$conf.int## [1] 7.705276 9.117945
## attr(,"conf.level")
## [1] 0.95
# The two CIs are essentially the same (large sample size)
# Now do this for the CLEC data.
t.test(VerizonC, conf.level = .95)$conf.int## [1] 8.075152 24.943109
## attr(,"conf.level")
## [1] 0.95
Remember that the sample size was large for their own customers(1664) but very low sample size for customers from other companies(23). Therefore, it is questionable to use t-test for the CLEC data.
So Which one should we use?
Example 10: Which one should we believe?
Compare bootstrap CI and t.test CI for data from a known distribution that is skewed.
Exponential distribution, lambda = 1, sample size = 10.
Replicate CIs for the pop. mean for 1000 samples, with confidence level 95%.
# Fraction of CIs that contains 1 = simulated coverage probability.
# Put everything in one data frame.
# Compare t-based and bootstrap CIs on Exp(1) samples of size 10.
# Columns: L.t/U.t = t-interval endpoints, L.boot/U.boot = bootstrap endpoints.
mydf.6 <- data.frame(L.t = rep(NA,1000))
mydf.6$U.t <- NA
mydf.6$L.boot <- NA
mydf.6$U.boot <- NA
# make 1000 samples,
# find t.test CI for each case,
# make bootstrap CI for each case,
n <- 10
alpha <- .9 # NOTE: despite the name, this is the CONFIDENCE level, not the significance level
for (j in 1:1000){
  x.0 <- rexp(n)
  mydf.6[j,1:2] <- t.test(x.0, conf.level = alpha)$conf.int
  boot.mean <- replicate(10000, mean(sample(x.0, replace = TRUE)))
  mydf.6[j,3:4] <- quantile(boot.mean,c((1-alpha)/2, (1+alpha)/2))
}
# Histogram of the bootstrap means from the last sample only:
hist(boot.mean, breaks = 40)
head(mydf.6)## L.t U.t L.boot U.boot
## 1 0.2372943 1.116045 0.3238111 1.076931
## 2 0.3470579 1.689544 0.4762477 1.629433
## 3 0.8799204 2.342777 1.0403411 2.263653
## 4 0.3944768 2.503756 0.6287254 2.428744
## 5 0.1087424 1.147443 0.2303944 1.106497
## 6 0.4996889 1.404923 0.5868083 1.354018
# A CI "covers" when it contains the true mean, which is 1 for Exp(1).
mydf.6$t.cover <- (mydf.6$L.t < 1 & mydf.6$U.t > 1)
mydf.6$boot.cover <- (mydf.6$L.boot < 1 & mydf.6$U.boot > 1)
mean(mydf.6$t.cover) # about 86% of the time 1 is in the t interval## [1] 0.862
mean(mydf.6$boot.cover) # about 83% for the bootstrap interval## [1] 0.827
Still, the t-based CI appears to give somewhat better coverage than the bootstrap CI here.
CI for Proportions for one sample
Example 11
Gallup poll data of October 2017: Observe 134 successes (Yes answers) in 1028 trials (interviews). Gallup poll data of October 2018: Observe 214 successes (Yes answers) in 1021 trials (interviews).
# Gallup 2017: 134 "Yes" answers out of 1028 interviews.
n.2017 = 1028
yes.2017 = 134
# One-sample proportion test; default null is p = 0.5.
# The reported CI is a (Wilson) score interval with continuity correction.
prop.test(yes.2017, n.2017)##
## 1-sample proportions test with continuity correction
##
## data: yes.2017 out of n.2017, null probability 0.5
## X-squared = 560.39, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.1106849 0.1528326
## sample estimates:
## p
## 0.1303502
# Same thing for 214 successes in 1021 trials:
n.2018 = 1021
yes.2018 = 214
prop.test(yes.2018, n.2018)$conf.int # R computes score intervals.## [1] 0.1852772 0.2361393
## attr(,"conf.level")
## [1] 0.95
Example 12
Compute Wald confidence intervals by hand for comparison.
# Wald CI: p.hat +/- z* sqrt(p.hat(1 - p.hat)/n), with z* = 97.5% normal quantile.
p.hat = yes.2017/n.2017
SE = sqrt( p.hat*(1-p.hat)/n.2017 )
zstar = qnorm(.975)
c(p.hat - zstar*SE, p.hat + zstar*SE)## [1] 0.1097686 0.1509318
# Compare with the score interval from prop.test (slightly different):
prop.test(yes.2017, n.2017)$conf.int## [1] 0.1106849 0.1528326
## attr(,"conf.level")
## [1] 0.95
CI for the difference of two proportions.
prop.test(c(yes.2018,yes.2017),c(n.2018,n.2017))##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(yes.2018, yes.2017) out of c(n.2018, n.2017)
## X-squared = 22.258, df = 1, p-value = 2.383e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.04591606 0.11258042
## sample estimates:
## prop 1 prop 2
## 0.2095984 0.1303502
# The CI does not contains 0, so there is evidence from the CI
# that the two proportions are different.
# We can also perform a chi squared test to find out
# if the proportion of successes is the same in both populations.
# We must use the number of success and the number of failures to do this,
# not the number of successes and the number of trials.
chisq.test(matrix(c(yes.2018,yes.2017,
n.2018 - yes.2018,n.2017 - yes.2017),ncol = 2))##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: matrix(c(yes.2018, yes.2017, n.2018 - yes.2018, n.2017 - yes.2017), ncol = 2)
## X-squared = 22.258, df = 1, p-value = 2.383e-06
# Note that the Chi-squared statistic and p-value are the same.
Example 13: One-sided confidence bounds
- Make a 95% lower confidence bound for the mean birth weights of babies born in Alaska
# One-sided bound: alternative = "greater" makes the CI (lower bound, Inf),
# i.e. a 95% LOWER confidence bound for the mean AK birth weight.
mytest.AK.lower <- t.test(girlsAK, conf.level = .95,
alternative = "greater")
mytest.AK.lower##
## One Sample t-test
##
## data: girlsAK
## t = 38.421, df = 39, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 0
## 95 percent confidence interval:
## 3362.147 Inf
## sample estimates:
## mean of x
## 3516.35
- Make a 99% lower confidence bound for the difference of mean birth weights (Alaska - Wyoming)
# With the formula interface the groups are ordered AK then WY, so this bounds AK - WY.
mytest.diff.lower = t.test(Weight ~ State, data = Girls2004,
alternative = "greater", conf.level = .99)
mytest.diff.lower$conf.int## [1] 39.69792 Inf
## attr(,"conf.level")
## [1] 0.99
- Make a 95% upper confidence bound for approval rate for Congress in October 2018
# alternative = "less" gives (0, upper bound) for the proportion.
prop.test(yes.2018, n.2018, alternative = "less")$conf.int## [1] 0.0000000 0.2318109
## attr(,"conf.level")
## [1] 0.95
Simple Linear Regression
Example 1: From the notes
library(ISLR)
library(MASS)
ad=read.csv("advertising.csv")
head(ad)## TV Radio Newspaper Sales
## 1 230.1 37.8 69.2 22.1
## 2 44.5 39.3 45.1 10.4
## 3 17.2 45.9 69.3 12.0
## 4 151.5 41.3 58.5 16.5
## 5 180.8 10.8 58.4 17.9
## 6 8.7 48.9 75.0 7.2
fit=lm(Sales ~ TV+Radio+Newspaper, data=ad)
summary(fit)##
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = ad)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3034 -0.8244 -0.0008 0.8976 3.7473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6251241 0.3075012 15.041 <2e-16 ***
## TV 0.0544458 0.0013752 39.592 <2e-16 ***
## Radio 0.1070012 0.0084896 12.604 <2e-16 ***
## Newspaper 0.0003357 0.0057881 0.058 0.954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.662 on 196 degrees of freedom
## Multiple R-squared: 0.9026, Adjusted R-squared: 0.9011
## F-statistic: 605.4 on 3 and 196 DF, p-value: < 2.2e-16
fit2=lm(Sales ~ TV+Radio, data=ad)
summary(fit2)##
## Call:
## lm(formula = Sales ~ TV + Radio, data = ad)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3131 -0.8269 0.0095 0.9022 3.7484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.630879 0.290308 15.95 <2e-16 ***
## TV 0.054449 0.001371 39.73 <2e-16 ***
## Radio 0.107175 0.007926 13.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.657 on 197 degrees of freedom
## Multiple R-squared: 0.9026, Adjusted R-squared: 0.9016
## F-statistic: 912.7 on 2 and 197 DF, p-value: < 2.2e-16
Cr_data= ISLR::Credit
head(Cr_data)## ID Income Limit Rating Cards Age Education Gender Student Married Ethnicity
## 1 1 14.891 3606 283 2 34 11 Male No Yes Caucasian
## 2 2 106.025 6645 483 3 82 15 Female Yes Yes Asian
## 3 3 104.593 7075 514 4 71 11 Male No No Asian
## 4 4 148.924 9504 681 3 36 11 Female No No Asian
## 5 5 55.882 4897 357 2 68 16 Male No Yes Caucasian
## 6 6 80.180 8047 569 4 77 10 Male No No Caucasian
## Balance
## 1 333
## 2 903
## 3 580
## 4 964
## 5 331
## 6 1151
fit3 = lm(Balance ~ Gender, data=Cr_data)
summary(fit3)##
## Call:
## lm(formula = Balance ~ Gender, data = Cr_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -529.54 -455.35 -60.17 334.71 1489.20
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 509.80 33.13 15.389 <2e-16 ***
## GenderFemale 19.73 46.05 0.429 0.669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 460.2 on 398 degrees of freedom
## Multiple R-squared: 0.0004611, Adjusted R-squared: -0.00205
## F-statistic: 0.1836 on 1 and 398 DF, p-value: 0.6685
contrasts(Cr_data$Gender)## Female
## Male 0
## Female 1
fit4 = lm(Balance ~ Ethnicity, data=Cr_data)
summary(fit4)##
## Call:
## lm(formula = Balance ~ Ethnicity, data = Cr_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -531.00 -457.08 -63.25 339.25 1480.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 531.00 46.32 11.464 <2e-16 ***
## EthnicityAsian -18.69 65.02 -0.287 0.774
## EthnicityCaucasian -12.50 56.68 -0.221 0.826
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 460.9 on 397 degrees of freedom
## Multiple R-squared: 0.0002188, Adjusted R-squared: -0.004818
## F-statistic: 0.04344 on 2 and 397 DF, p-value: 0.9575
contrasts(Cr_data$Ethnicity)## Asian Caucasian
## African American 0 0
## Asian 1 0
## Caucasian 0 1
Example 2
library(ISLR)
library(MASS)
names(Boston)## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
?Boston
# Scatterplot and simple linear regression of medv on lstat.
plot(medv~lstat,data=Boston)
lm.fit <- lm(medv~lstat,data=Boston)
plot(medv~lstat,data=Boston)
abline(lm.fit, col="red")#or
# NOTE(review): attach() is discouraged — it can silently mask other objects.
# Prefer lm(medv ~ lstat, data = Boston); kept here to match the printed call.
attach(Boston)
lm.fit <- lm(medv~lstat)
summary(lm.fit)##
## Call:
## lm(formula = medv ~ lstat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
names(lm.fit)## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
coef(lm.fit)## (Intercept) lstat
## 34.5538409 -0.9500494
confint(lm.fit)# CI## 2.5 % 97.5 %
## (Intercept) 33.448457 35.6592247
## lstat -1.026148 -0.8739505
# Residual plot, then the four standard lm diagnostic plots.
plot(lm.fit$residuals)
plot(lm.fit)
Example 3
Data Science Question: Does the variable “energy” contribute to predicting the “danceability” of songs for both musicians, Taylor Swift and Beyoncé?
Danceability: Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.
# Spotify audio features for songs by two artists.
Artists <- read.csv("Artists.csv",header = TRUE)
Danceability and Energy of songs by Taylor Swift
set.seed(12)
library(caret)## Loading required package: lattice
library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
Artists1 <- Artists #make a copy
# NOTE(review): the name "ts" masks stats::ts; a name like "taylor" would be safer.
ts <- Artists1[Artists1$artist_name=="Taylor Swift",]
names(ts)## [1] "X" "artist_name" "Valence"
## [4] "danceability" "energy" "loudness"
## [7] "speechiness" "acousticness" "liveness"
## [10] "tempo" "track_name" "album_name"
## [13] "album_release_year"
# 80/20 train/test split, stratified on the response by createDataPartition().
training.samples <- ts$danceability %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- ts[training.samples, ]
test.data <- ts[-training.samples, ]
dim(train.data)## [1] 960 13
# Simple linear regression: danceability on energy.
fit1 <- lm(danceability~energy, data=train.data)
summary(fit1)##
## Call:
## lm(formula = danceability ~ energy, data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42627 -0.06289 0.00801 0.06838 0.29897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.61000 0.01059 57.629 <2e-16 ***
## energy -0.03272 0.01749 -1.871 0.0616 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1108 on 958 degrees of freedom
## Multiple R-squared: 0.003642, Adjusted R-squared: 0.002602
## F-statistic: 3.501 on 1 and 958 DF, p-value: 0.06162
P-value is large, so at the 5% significance level it seems there is not a significant relationship between energy and Danceability in Taylor Swift’s songs.
The F statistic is small, also indicating there is no significant relationship between the predictor and the response.
Since \(R^2 \approx 0.004\), the relationship is very weak; and since the estimated slope is negative, what little association there is goes in the negative direction.
Dancebilty and Energy of songs by Beyonce
# Repeat the train/test split and regression setup for Beyoncé's songs.
set.seed(12)
library(caret)
# NOTE(review): the name "by" masks base::by; a name like "beyonce" would be safer.
by<- Artists1[Artists1$artist_name=="Beyoncé",]
names(by)## [1] "X" "artist_name" "Valence"
## [4] "danceability" "energy" "loudness"
## [7] "speechiness" "acousticness" "liveness"
## [10] "tempo" "track_name" "album_name"
## [13] "album_release_year"
# 80/20 stratified train/test split, as for Taylor Swift above.
training.samples <- by$danceability %>%
createDataPartition(p = 0.8, list = FALSE)
train.data2 <- by[training.samples, ]
test.data2 <- by[-training.samples, ]
dim(train.data2)## [1] 537 13
fit2 =lm(danceability~energy, data=train.data2)
summary(fit2)##
## Call:
## lm(formula = danceability ~ energy, data = train.data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55779 -0.11707 0.00723 0.10903 0.34103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47185 0.02337 20.189 < 2e-16 ***
## energy 0.15401 0.03558 4.328 1.79e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1662 on 535 degrees of freedom
## Multiple R-squared: 0.03383, Adjusted R-squared: 0.03203
## F-statistic: 18.74 on 1 and 535 DF, p-value: 1.793e-05
P-value is very small, so at the 5% significance level there is a significant relationship between energy and Danceability in Beyoncé’s songs.
The F statistic is large, also indicating there is a significant relationship between the predictor and the response.
Finally, our model equation can be written as follow:
\[dancebility = 0.47 + 0.15*energy \]
The confidence interval of the model coefficient can be extracted as follow:
confint(fit2)## 2.5 % 97.5 %
## (Intercept) 0.42593927 0.5177620
## energy 0.08411477 0.2239069
Model accuracy assessment:
As we have seen in simple linear regression, the overall quality of the model can be assessed by examining the R-squared (R2) and Residual Standard Error (RSE).
R-squared:
R^2 represents the proportion of variance, in the outcome variable y. An R2 value close to 1 indicates that the model explains a large portion of the variance in the outcome variable.
A problem with the R^2, is that, it will always increase when more variables are added to the model, even if those variables are only weakly associated with the response (James et al. 2014).
A solution is to adjust the R^2 by taking into account the number of predictor variables.
The adjustment in the “Adjusted R Square” value in the summary output is a correction for the number of x variables included in the prediction model.
Residual Standard Error (RSE) or sigma(“residual standard deviation”):
The RSE estimate gives a measure of error of prediction. The lower the RSE, the more accurate the model.
Predictions
# Predict danceability on each artist's held-out test set.
predictions <- fit1 %>% predict(test.data)
#or
y_hat <- predict(fit1, test.data)
predictions2 <- fit2 %>% predict(test.data2)
#or
y_hat2 <- predict(fit2, test.data2)
Model performance
- Model 1
# (a) Prediction error, RMSE (Root mean square error)
RMSE(predictions, test.data$danceability) # RMSE{caret package}## [1] 0.107848
# (b) R-square ## R2{caret package}
R2(predictions, test.data$danceability)## [1] 0.01209772
- Model 2
# (a) Prediction error, RMSE (Root mean square error)
RMSE(predictions2, test.data2$danceability) # RMSE{caret package}## [1] 0.1691094
# (b) R-square ## R2{caret package}
R2(predictions2, test.data2$danceability)## [1] 0.04044137
**Note:** The prediction error RMSE of model 1 is lower than that of model 2. However, these are two completely different models fit to different data sets, so this comparison is not very meaningful; comparing RMSE makes sense when the models are fit to the same data with different sets of variables.